Introduction

This report analyzes the BostonHousing dataset, which contains information about housing in Boston. We will explore the relationships between several variables and the median home value (medv).


1. Loading Libraries and Data

## 'data.frame':    506 obs. of  14 variables:
##  $ crim   : num  0.00632 0.02731 0.02729 0.03237 0.06905 ...
##  $ zn     : num  18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
##  $ indus  : num  2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
##  $ chas   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ nox    : num  0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
##  $ rm     : num  6.58 6.42 7.18 7 7.15 ...
##  $ age    : num  65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
##  $ dis    : num  4.09 4.97 4.97 6.06 6.06 ...
##  $ rad    : int  1 2 2 3 3 3 5 5 5 5 ...
##  $ tax    : int  296 242 242 222 222 222 311 311 311 311 ...
##  $ ptratio: num  15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
##  $ b      : num  397 397 393 395 397 ...
##  $ lstat  : num  4.98 9.14 4.03 2.94 5.33 ...
##  $ medv   : num  24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...

Histogram on Medv

hist(BostonHousing$medv, breaks = 15)

On this histogram, we can see that the distribution is not normal. The mode is at arout 20 to 25.

Linear regression

pairs(
  data = BostonHousing,
  ~.
)

lin_reg <- lm(medv ~ rm, data = BostonHousing)
ggplot(data = BostonHousing, aes(x = medv, y = rm)) +
  geom_point() +
  geom_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'

Linear Multiple Regression

Backward Elimination

model <- lm(medv ~ ., data = BostonHousing)

summary(model)
## 
## Call:
## lm(formula = medv ~ ., data = BostonHousing)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -15.595  -2.730  -0.518   1.777  26.199 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.646e+01  5.103e+00   7.144 3.28e-12 ***
## crim        -1.080e-01  3.286e-02  -3.287 0.001087 ** 
## zn           4.642e-02  1.373e-02   3.382 0.000778 ***
## indus        2.056e-02  6.150e-02   0.334 0.738288    
## chas         2.687e+00  8.616e-01   3.118 0.001925 ** 
## nox         -1.777e+01  3.820e+00  -4.651 4.25e-06 ***
## rm           3.810e+00  4.179e-01   9.116  < 2e-16 ***
## age          6.922e-04  1.321e-02   0.052 0.958229    
## dis         -1.476e+00  1.995e-01  -7.398 6.01e-13 ***
## rad          3.060e-01  6.635e-02   4.613 5.07e-06 ***
## tax         -1.233e-02  3.760e-03  -3.280 0.001112 ** 
## ptratio     -9.527e-01  1.308e-01  -7.283 1.31e-12 ***
## b            9.312e-03  2.686e-03   3.467 0.000573 ***
## lstat       -5.248e-01  5.072e-02 -10.347  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.745 on 492 degrees of freedom
## Multiple R-squared:  0.7406, Adjusted R-squared:  0.7338 
## F-statistic: 108.1 on 13 and 492 DF,  p-value: < 2.2e-16
backward_model <- step(model, direction = "backward", trace = FALSE)

summary(backward_model)
## 
## Call:
## lm(formula = medv ~ crim + zn + chas + nox + rm + dis + rad + 
##     tax + ptratio + b + lstat, data = BostonHousing)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -15.5984  -2.7386  -0.5046   1.7273  26.2373 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  36.341145   5.067492   7.171 2.73e-12 ***
## crim         -0.108413   0.032779  -3.307 0.001010 ** 
## zn            0.045845   0.013523   3.390 0.000754 ***
## chas          2.718716   0.854240   3.183 0.001551 ** 
## nox         -17.376023   3.535243  -4.915 1.21e-06 ***
## rm            3.801579   0.406316   9.356  < 2e-16 ***
## dis          -1.492711   0.185731  -8.037 6.84e-15 ***
## rad           0.299608   0.063402   4.726 3.00e-06 ***
## tax          -0.011778   0.003372  -3.493 0.000521 ***
## ptratio      -0.946525   0.129066  -7.334 9.24e-13 ***
## b             0.009291   0.002674   3.475 0.000557 ***
## lstat        -0.522553   0.047424 -11.019  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.736 on 494 degrees of freedom
## Multiple R-squared:  0.7406, Adjusted R-squared:  0.7348 
## F-statistic: 128.2 on 11 and 494 DF,  p-value: < 2.2e-16
plot(backward_model)

Perfect linear regression

lin_reg <- lm(medv ~ medv, data = BostonHousing)
## Warning in model.matrix.default(mt, mf, contrasts): la réponse est apparue dans
## le membre de droite et y a été éliminée
## Warning in model.matrix.default(mt, mf, contrasts): problème avec le terme 1
## dans model.matrix : aucune colonne n'est assignée
plot(lin_reg)
## Warning in model.matrix.default(object, data = structure(list(medv = c(24, : la
## réponse est apparue dans le membre de droite et y a été éliminée
## Warning in model.matrix.default(object, data = structure(list(medv = c(24, :
## problème avec le terme 1 dans model.matrix : aucune colonne n'est assignée

## les valeurs estimées ("leverages") sont tous = 0.001976285
##  et il n'y a aucune variable dépendante facteur ; aucun graphe no. 5

Worst linear regression

lin_reg <- lm(medv ~ indus, data = BostonHousing)
plot(lin_reg)

Can you make a box plot of the value of the rivers in the BostonHousing dataset?

ggplot(BostonHousing, aes(x = factor(chas), y = medv, fill = factor(chas))) +
  geom_boxplot(alpha = 0.7) +
  labs(title = "Boxplot of Median Home Values by River Proximity",
       x = "Proximity to the River (0 = No, 1 = Yes)",
       y = "Median Home Value (in $1000s)",
       fill = "Proximity") +
  scale_fill_manual(values = c("0" = "red", "1" = "blue")) +
  theme_minimal()

Observation: Proximity to the river seems to have a slight impact on home prices, but the distribution remains fairly similar.


5. Distribution of Key Variables

We analyze the distribution of home prices (medv).


6. Correlations Between Variables

We compute the correlation matrix and display values associated with medv.

# Compute correlation matrix
cor_matrix <- cor(BostonHousing)

# Display correlations with MEDV
cor_matrix["medv", ]
##       crim         zn      indus       chas        nox         rm        age 
## -0.3883046  0.3604453 -0.4837252  0.1752602 -0.4273208  0.6953599 -0.3769546 
##        dis        rad        tax    ptratio          b      lstat       medv 
##  0.2499287 -0.3816262 -0.4685359 -0.5077867  0.3334608 -0.7376627  1.0000000

Observations:
- Positive correlation with rm (number of rooms).
- Negative correlation with lstat (percentage of lower-status population).
- tax and crim have a negative relationship with medv.


6.1 Key Relationships

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'


7. Impact of Crime and Taxation on Home Prices

7.1 Distribution of crim and tax

Observations:
- crim is highly skewed: most neighborhoods have low crime rates, but some areas have extreme values.
- tax shows distinct peaks, suggesting fixed tax rates in certain areas.


7.2 Relationship Between Crime Rate and Home Prices

## `geom_smooth()` using formula = 'y ~ x'

Observation: Home prices decrease as crime rates increase.


7.3 Relationship Between Taxation and Home Prices

## `geom_smooth()` using formula = 'y ~ x'

Observation: A higher tax rate is associated with lower home prices.


8. Multiple Regression Analysis

## 
## Call:
## lm(formula = medv ~ crim + tax, data = BostonHousing)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -14.005  -4.929  -2.099   2.945  33.676 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 31.379119   1.033523  30.361  < 2e-16 ***
## crim        -0.186617   0.051156  -3.648 0.000292 ***
## tax         -0.020018   0.002611  -7.667 9.15e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.036 on 503 degrees of freedom
## Multiple R-squared:  0.2396, Adjusted R-squared:  0.2366 
## F-statistic: 79.27 on 2 and 503 DF,  p-value: < 2.2e-16

Results:
- The coefficients for crim and tax are negative, confirming their negative impact on medv.
- p-values < 0.05 indicate these effects are statistically significant.


9. Separate data between train and test

# Load necessary libraries
set.seed(123)
sample_size <- floor(0.8 * nrow(BostonHousing))
train_indices <- sample(seq_len(nrow(BostonHousing)), size = sample_size)
train_data <- BostonHousing[train_indices, ]
test_data <- BostonHousing[-train_indices, ]

initial_model <- lm(medv ~ 1, data = train_data)

9.1. Forward Selection

forward_model <- stepAIC(initial_model, direction = "forward", scope = list(lower=initial_model, upper=~crim+ zn+ indus+chas +nox+rm+ age +dis+rad+tax+ptratio+b+lstat ))
## Start:  AIC=1794.1
## medv ~ 1
## 
##           Df Sum of Sq   RSS    AIC
## + lstat    1   18882.8 15226 1470.2
## + rm       1   16276.0 17832 1534.1
## + ptratio  1    9523.9 24584 1663.8
## + tax      1    7627.9 26480 1693.8
## + indus    1    7324.6 26784 1698.4
## + nox      1    5913.2 28195 1719.2
## + rad      1    5021.7 29087 1731.8
## + crim     1    5018.5 29090 1731.8
## + age      1    4779.6 29329 1735.1
## + zn       1    4293.4 29815 1741.7
## + b        1    3449.2 30659 1753.0
## + dis      1    2093.9 32014 1770.5
## + chas     1    1056.1 33052 1783.4
## <none>                 34108 1794.1
## 
## Step:  AIC=1470.24
## medv ~ lstat
## 
##           Df Sum of Sq   RSS    AIC
## + rm       1   2941.58 12284 1385.5
## + ptratio  1   2209.92 13016 1408.9
## + chas     1    796.68 14429 1450.5
## + dis      1    737.33 14488 1452.2
## + age      1    295.97 14930 1464.3
## + tax      1    158.27 15067 1468.0
## + crim     1     81.23 15144 1470.1
## + zn       1     79.62 15146 1470.1
## <none>                 15226 1470.2
## + b        1     63.28 15162 1470.6
## + indus    1     37.42 15188 1471.2
## + nox      1     18.58 15207 1471.8
## + rad      1      3.99 15222 1472.1
## 
## Step:  AIC=1385.51
## medv ~ lstat + rm
## 
##           Df Sum of Sq   RSS    AIC
## + ptratio  1   1337.28 10947 1341.0
## + chas     1    641.96 11642 1365.8
## + dis      1    419.55 11864 1373.5
## + b        1    201.23 12083 1380.8
## + tax      1    178.43 12106 1381.6
## + crim     1    170.05 12114 1381.9
## + age      1     75.82 12208 1385.0
## <none>                 12284 1385.5
## + rad      1     43.67 12240 1386.1
## + zn       1     19.76 12264 1386.9
## + indus    1      8.34 12276 1387.2
## + nox      1      0.25 12284 1387.5
## 
## Step:  AIC=1340.95
## medv ~ lstat + rm + ptratio
## 
##         Df Sum of Sq   RSS    AIC
## + dis    1    546.03 10401 1322.3
## + chas   1    414.26 10532 1327.4
## + b      1    169.40 10777 1336.7
## + age    1    131.58 10815 1338.1
## + crim   1     59.78 10887 1340.7
## <none>               10947 1341.0
## + rad    1     47.26 10899 1341.2
## + zn     1     22.12 10924 1342.1
## + indus  1     10.28 10936 1342.6
## + tax    1      2.79 10944 1342.8
## + nox    1      0.45 10946 1342.9
## 
## Step:  AIC=1322.27
## medv ~ lstat + rm + ptratio + dis
## 
##         Df Sum of Sq     RSS    AIC
## + nox    1    499.37  9901.2 1304.4
## + chas   1    295.49 10105.1 1312.6
## + b      1    237.57 10163.0 1314.9
## + indus  1    196.54 10204.1 1316.6
## + zn     1    156.10 10244.5 1318.2
## + crim   1    142.85 10257.8 1318.7
## + tax    1    119.46 10281.2 1319.6
## <none>               10400.6 1322.3
## + age    1     25.87 10374.7 1323.3
## + rad    1      0.42 10400.2 1324.3
## 
## Step:  AIC=1304.4
## medv ~ lstat + rm + ptratio + dis + nox
## 
##         Df Sum of Sq    RSS    AIC
## + chas   1    350.10 9551.1 1291.8
## + zn     1    153.36 9747.9 1300.1
## + b      1    150.70 9750.6 1300.2
## + rad    1     89.12 9812.1 1302.7
## + crim   1     86.99 9814.3 1302.8
## <none>               9901.2 1304.4
## + indus  1     19.25 9882.0 1305.6
## + age    1      1.44 9899.8 1306.3
## + tax    1      0.80 9900.4 1306.4
## 
## Step:  AIC=1291.85
## medv ~ lstat + rm + ptratio + dis + nox + chas
## 
##         Df Sum of Sq    RSS    AIC
## + zn     1   160.673 9390.5 1287.0
## + b      1   125.439 9425.7 1288.5
## + rad    1   108.724 9442.4 1289.2
## + crim   1    64.013 9487.1 1291.1
## <none>               9551.1 1291.8
## + indus  1    28.105 9523.0 1292.7
## + tax    1     1.193 9550.0 1293.8
## + age    1     0.436 9550.7 1293.8
## 
## Step:  AIC=1287
## medv ~ lstat + rm + ptratio + dis + nox + chas + zn
## 
##         Df Sum of Sq    RSS    AIC
## + b      1   142.489 9248.0 1282.8
## + crim   1   104.162 9286.3 1284.5
## + rad    1    66.011 9324.5 1286.2
## <none>               9390.5 1287.0
## + indus  1    28.314 9362.2 1287.8
## + age    1     6.578 9383.9 1288.7
## + tax    1     5.502 9385.0 1288.8
## 
## Step:  AIC=1282.82
## medv ~ lstat + rm + ptratio + dis + nox + chas + zn + b
## 
##         Df Sum of Sq    RSS    AIC
## + rad    1   115.883 9132.1 1279.7
## + crim   1    64.248 9183.7 1282.0
## <none>               9248.0 1282.8
## + indus  1    19.159 9228.8 1284.0
## + age    1     2.476 9245.5 1284.7
## + tax    1     0.018 9248.0 1284.8
## 
## Step:  AIC=1279.73
## medv ~ lstat + rm + ptratio + dis + nox + chas + zn + b + rad
## 
##         Df Sum of Sq    RSS    AIC
## + crim   1   188.587 8943.5 1273.3
## + tax    1   186.216 8945.9 1273.4
## <none>               9132.1 1279.7
## + indus  1    33.416 9098.7 1280.2
## + age    1     7.356 9124.7 1281.4
## 
## Step:  AIC=1273.3
## medv ~ lstat + rm + ptratio + dis + nox + chas + zn + b + rad + 
##     crim
## 
##         Df Sum of Sq    RSS    AIC
## + tax    1   199.421 8744.1 1266.2
## <none>               8943.5 1273.3
## + indus  1    39.008 8904.5 1273.5
## + age    1     7.751 8935.8 1275.0
## 
## Step:  AIC=1266.19
## medv ~ lstat + rm + ptratio + dis + nox + chas + zn + b + rad + 
##     crim + tax
## 
##         Df Sum of Sq    RSS    AIC
## <none>               8744.1 1266.2
## + age    1   12.7502 8731.3 1267.6
## + indus  1    1.1286 8743.0 1268.1
plot(forward_model, which=2)

summary(forward_model)
## 
## Call:
## lm(formula = medv ~ lstat + rm + ptratio + dis + nox + chas + 
##     zn + b + rad + crim + tax, data = train_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.4118  -2.6786  -0.5805   1.7247  25.1597 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  39.828112   5.692801   6.996 1.14e-11 ***
## lstat        -0.575216   0.052701 -10.915  < 2e-16 ***
## rm            3.481624   0.461829   7.539 3.33e-13 ***
## ptratio      -0.975927   0.144148  -6.770 4.71e-11 ***
## dis          -1.535931   0.206057  -7.454 5.86e-13 ***
## nox         -17.406957   3.923249  -4.437 1.19e-05 ***
## chas          3.150152   0.913474   3.449 0.000625 ***
## zn            0.046922   0.015137   3.100 0.002075 ** 
## b             0.007637   0.003160   2.416 0.016128 *  
## rad           0.302551   0.068610   4.410 1.34e-05 ***
## crim         -0.103341   0.034359  -3.008 0.002802 ** 
## tax          -0.010953   0.003663  -2.990 0.002966 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.723 on 392 degrees of freedom
## Multiple R-squared:  0.7436, Adjusted R-squared:  0.7364 
## F-statistic: 103.4 on 11 and 392 DF,  p-value: < 2.2e-16
test_predictions <- predict(forward_model, newdata = test_data)

actual_vs_predicted <- data.frame(Actual = test_data$medv, Predicted = test_predictions)

residuals <- data.frame(Predicted = test_predictions, Residuals = test_data$medv - test_predictions)

ggplot(residuals, aes(x = Predicted, y = Residuals)) +
  geom_point() +
  geom_hline(yintercept = 0, color = "red", linetype = "dashed") +
  labs(title = "Residual Plot", x = "Predicted Values", y = "Residuals") +
  theme_minimal() +
  # Ensure the full range of predicted values is shown
  coord_cartesian(xlim = range(test_predictions))

correlation_matrix <- cor(train_data)

9.2. Correlation Matrix

ggplot(reshape2::melt(correlation_matrix), aes(x = Var1, y = Var2, fill = value)) +
  geom_tile() +
  scale_fill_gradient2(low = "blue", high = "red", mid = "white", midpoint = 0, limit = c(-1, 1), space = "Lab", name = "Pearson\nCorrelation") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, size = 12, hjust = 1)) +
  coord_fixed()

9.3. Random Forest Model

set.seed(123)
rf_model <- randomForest(medv ~ ., data = train_data, importance = TRUE)
rf_predictions <- predict(rf_model, newdata = test_data)
# Evaluate the model
actual_vs_predicted_rf <- data.frame(Actual = test_data$medv, Predicted = rf_predictions)

# Plot actual vs predicted values
ggplot(actual_vs_predicted_rf, aes(x = Actual, y = Predicted)) +
  geom_point() +
  geom_abline(slope = 1, intercept = 0, color = "red", linetype = "dashed") +
  labs(title = "Actual vs Predicted Values (Random Forest)", x = "Actual Values", y = "Predicted Values") +
  theme_minimal()

9.4. Decision Tree

# Plot a tree using rpart
library(rpart)
tree_model <- rpart(medv ~ ., data = train_data)

library(rpart.plot)
rpart.plot(tree_model, cex = 0.8)